Load R packages

library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally) ; library(ggpubr)
library(rstatix)
library(expss)
library(DESeq2)
library(knitr) ; library(kableExtra)

Load preprocessed dataset (code in 2.1.Preprocessing_pipeline.Rmd)

# SFARI Genes
SFARI_genes = read_csv('./../../SFARI/Data/SFARI_genes_01-03-2020_w_ensembl_IDs.csv')

# Load Gandal dataset
load('./../Data/preprocessedData/preprocessed_data.RData')
datExpr = datExpr %>% data.frame

# Updates genes_info with SFARI information
genes_info = genes_info %>% left_join(SFARI_genes, by = 'ID') %>%
             mutate(gene.score = ifelse(is.na(`gene-score`) & Neuronal==0, 'Others', 
                                        ifelse(is.na(`gene-score`), 'Neuronal', `gene-score`))) %>%
             mutate(Group = factor(ifelse(gene.score %in% c('Neuronal','Others'), gene.score, 'SFARI'), 
                    levels = c('SFARI', 'Neuronal', 'Others')))

# SFARI palette
SFARI_colour_hue = function(r) {
  pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}




There are 864 SFARI Genes in the expression dataset (~77%), 789 of them have a SFARI Gene Score.

Gene count by SFARI score:

table_info = genes_info %>% apply_labels(`gene-score` = 'SFARI Gene Score', syndromic = 'Syndromic Tag',
                                          Neuronal = 'Neuronal Function', gene.score = 'Gene Score') %>%
             mutate(syndromic = as.logical(syndromic), Neuronal = as.logical(Neuronal))

cro(table_info$`gene-score`)
 #Total 
 SFARI Gene Score 
   1  144
   2  205
   3  440
   #Total cases  789


Gene count by Syndromic tag:

cro(table_info$syndromic)
 #Total 
 Syndromic Tag 
   FALSE  748
   TRUE  116
   #Total cases  864


Neuronal annotations:


1545 genes have neuronal-related annotations, 201 of these, have a SFARI score

cro(table_info$gene.score[genes_info$`gene-score` %in% as.character(c(1:3))],
    list(table_info$Neuronal[genes_info$`gene-score` %in% as.character(c(1:3))], total()))
 Neuronal Function     #Total 
 FALSE   TRUE   
 Gene Score 
   1  97 47   144
   2  146 59   205
   3  345 95   440
   #Total cases  588 201   789
rm(table_info)




2.4.1 Analysis of all SFARI Genes together


Mean Expression

plot_data = data.frame('ID'=rownames(datExpr), 'MeanExpr'=rowMeans(datExpr)) %>% 
            left_join(genes_info, by='ID')

wt = plot_data %>% wilcox_test(MeanExpr~Group, p.adjust.method='BH') %>% add_x_position(x = 'group')
increase = 1
base = 15.7
pos_y_comparisons = c(base, base+increase, base)
plot_data %>% ggplot(aes(Group, MeanExpr)) + 
     geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=Group)) + 
     stat_pvalue_manual(wt, y.position = pos_y_comparisons, tip.length = .03) +
     scale_fill_manual(values=c('#00A4F7', SFARI_colour_hue(r=c(8,7)))) + 
     xlab('') + ylab('Mean Expression') + #ggtitle('Mean Expression Comparison') +
     scale_x_discrete(labels=c('SFARI' = 'SFARI\nGenes', 'Others' = 'Other\ngenes',
                              'Neuronal' = 'Neuronal\ngenes')) +
     theme_minimal() + theme(legend.position='none')

Log Fold Change

The propotion of over and underexpressed genes is very simiar for each group, so we can just focus on the magnitude of the LFC independently of the sign.

genes_info %>% mutate(direction = ifelse(log2FoldChange>0, 'overexpressed', 'underexpressed')) %>% 
               group_by(Group, direction) %>% tally(name = 'overexpressed') %>% 
               filter(direction == 'overexpressed') %>% ungroup %>% 
               left_join(genes_info %>% group_by(Group) %>% tally(name = 'Total'), by = 'Group') %>% ungroup %>%
               mutate('underexpressed' = Total - overexpressed , 
                      'ratio' = round(overexpressed/underexpressed,2)) %>% 
               dplyr::select(Group, overexpressed, underexpressed, ratio) %>% 
               kable %>% kable_styling(full_width = F)
Group overexpressed underexpressed ratio
SFARI 379 410 0.92
Neuronal 673 671 1.00
Others 7194 6790 1.06


plot_data = data.frame('ID'=rownames(datExpr), 'MeanExpr'=rowMeans(datExpr)) %>% 
            left_join(genes_info, by='ID')

wt = plot_data %>% filter(log2FoldChange>0) %>% 
     wilcox_test(MeanExpr~Group, p.adjust.method='BH') %>% add_x_position(x = 'group')
increase = 1
base = 15.7
pos_y_comparisons = c(base, base+increase, base)
p1 = plot_data %>% filter(log2FoldChange>0) %>% ggplot(aes(Group, MeanExpr)) +
     geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=Group)) + 
     stat_pvalue_manual(wt, y.position = pos_y_comparisons, tip.length = .03) +
     scale_fill_manual(values=c('#00A4F7', SFARI_colour_hue(r=c(8,7)))) + 
     xlab('') + ylab('Mean Expression') + ggtitle('Overexpressed genes') +
     scale_x_discrete(labels=c('SFARI' = 'SFARI\nGenes', 'Others' = 'Other\ngenes',
                              'Neuronal' = 'Neuronal\ngenes')) +
     theme_minimal() + theme(legend.position='none')

wt = plot_data %>% filter(log2FoldChange<0) %>% 
     wilcox_test(MeanExpr~Group, p.adjust.method='BH') %>% add_x_position(x = 'group')
p2 = plot_data %>% filter(log2FoldChange<0) %>% ggplot(aes(Group, MeanExpr)) +
     geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=Group)) + 
     stat_pvalue_manual(wt, y.position = pos_y_comparisons, tip.length = .03) +
     scale_fill_manual(values=c('#00A4F7', SFARI_colour_hue(r=c(8,7)))) + 
     xlab('') + ylab('Mean Expression') + ggtitle('Underexpressed genes') +
     scale_x_discrete(labels=c('SFARI' = 'SFARI\nGenes', 'Others' = 'Other\ngenes',
                              'Neuronal' = 'Neuronal\ngenes')) +
     theme_minimal() + theme(legend.position='none')

grid.arrange(p1, p2, nrow=1)


Both with the original LFC values as well as the shrunken ones, the difference in LFC magnitude between SFARI genes and Neuronal genes is statisically significant with a p-value lower than \(10^{-4}\).

wt = genes_info %>% mutate(abs_lfc = abs(log2FoldChange), 
                           Group = factor(Group, levels = c('SFARI','Others', 'Neuronal'))) %>% 
     wilcox_test(abs_lfc~Group, p.adjust.method='BH') %>% add_x_position(x = 'group')

increase = 0.04
base = 0.45
pos_y_comparisons = c(base, base + increase, base)
p1 = genes_info %>% mutate(Group = factor(Group, levels = c('SFARI','Others', 'Neuronal'))) %>% 
     ggplot(aes(Group, abs(log2FoldChange))) + 
     geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=Group)) + 
     stat_pvalue_manual(wt, y.position = pos_y_comparisons, tip.length = .005) + 
     scale_fill_manual(values=c('#00A4F7', SFARI_colour_hue(r=c(7,8)))) + 
     coord_cartesian(ylim= c(0, max(pos_y_comparisons))) + 
     scale_x_discrete(labels=c('SFARI' = 'SFARI\nGenes', 'Others' = 'Other\ngenes',
                              'Neuronal' = 'Neuronal\ngenes')) +
     xlab('') + ylab('Original LFC Magnitude') + theme_minimal() + theme(legend.position='none')


wt = genes_info %>% mutate(abs_lfc = abs(shrunken_log2FoldChange), 
                           Group = factor(Group, levels = c('SFARI','Others', 'Neuronal'))) %>% 
     wilcox_test(abs_lfc~Group, p.adjust.method='BH') %>% add_x_position(x = 'group')

increase = 0.03
base = 0.35
pos_y_comparisons = c(base, base + increase, base)
p2 = genes_info %>% mutate(Group = factor(Group, levels = c('SFARI','Others', 'Neuronal'))) %>% 
     ggplot(aes(Group, abs(shrunken_log2FoldChange))) + 
     geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=Group)) + 
     stat_pvalue_manual(wt, y.position = pos_y_comparisons, tip.length = .01) + 
     scale_fill_manual(values=c('#00A4F7', SFARI_colour_hue(r=c(7,8)))) + 
     coord_cartesian(ylim= c(0, max(pos_y_comparisons))) + 
     scale_x_discrete(labels=c('SFARI' = 'SFARI\nGenes', 'Others' = 'Other\ngenes',
                              'Neuronal' = 'Neuronal\ngenes')) +
     xlab('') + ylab('Shrunken LFC Magnitude') + theme_minimal() + theme(legend.position='none')

grid.arrange(p1, p2, nrow = 1)

rm(increase, base, pos_y_comparisons, wt)

Full plots, without cropping out outliers

p1 = ggplotly(genes_info %>% mutate(Group = factor(Group, levels = c('SFARI','Others', 'Neuronal'))) %>% 
     ggplot(aes(Group, abs(log2FoldChange))) + 
     geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=Group)) + 
     scale_fill_manual(values=c('#00A4F7', SFARI_colour_hue(r=c(7,8)))) + 
     scale_x_discrete(labels=c('SFARI' = 'SFARI\nGenes', 'Others' = 'Other\ngenes',
                              'Neuronal' = 'Neuronal\ngenes')) +
     xlab('') + ylab('Original LFC Magnitude') + theme_minimal() + theme(legend.position='none'))
  
p2 = ggplotly(genes_info %>% mutate(Group = factor(Group, levels = c('SFARI','Others', 'Neuronal'))) %>% 
     ggplot(aes(Group, abs(shrunken_log2FoldChange))) + 
     geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=Group)) + 
     scale_fill_manual(values=c('#00A4F7', SFARI_colour_hue(r=c(7,8)))) + 
     scale_x_discrete(labels=c('SFARI' = 'SFARI\nGenes', 'Others' = 'Other\ngenes',
                              'Neuronal' = 'Neuronal\ngenes')) +
     xlab('') + ylab('Shrunken LFC Magnitude') + theme_minimal() + theme(legend.position='none'))
  

subplot(p1, p2, nrows = 1)


Differential Expression

genes_info %>% group_by(Group, significant) %>% tally(name = 'DE') %>% filter(significant) %>% ungroup %>%
               left_join(genes_info %>% group_by(Group) %>% tally(name = 'Total'), by = 'Group') %>% ungroup %>%
               mutate('prop_DE' = round(DE/Total,2)) %>% dplyr::select(Group, DE, prop_DE, Total) %>% 
               kable(caption = 'Proportion of DE Genes by Group') %>% kable_styling(full_width = F)
Proportion of DE Genes by Group
Group DE prop_DE Total
SFARI 242 0.31 789
Neuronal 482 0.36 1344
Others 3749 0.27 13984


lfc_list = seq(0, 0.3, 0.01)

all_counts = data.frame('group'='All', 'n'=as.character(nrow(genes_info)))
Others_counts = data.frame('group'='Others', n=as.character(sum(genes_info$Group == 'Others')))
Neuronal_counts = data.frame('group'='Neuronal', n=as.character(sum(genes_info$Group == 'Neuronal')))
lfc_counts_all = genes_info %>% filter(Group == 'SFARI') %>% tally %>%
                 mutate('group'='SFARI', 'n'=as.character(n)) %>% 
                 dplyr::select(group, n) %>%
                 bind_rows(Neuronal_counts, Others_counts, all_counts) %>%
                 mutate('lfc'=-1) %>% dplyr::select(lfc, group, n)

for(lfc in lfc_list){
  
  # Recalculate genes_info with the new threshold (p-values change)
  DE_genes = results(dds, lfcThreshold=lfc, altHypothesis='greaterAbs') %>% data.frame %>% 
             mutate('ID' = rownames(.)) %>% 
             left_join(genes_info %>% dplyr::select(ID, Neuronal, gene.score, Group), by = 'ID') %>% 
             filter(padj<0.05 & abs(log2FoldChange)>lfc)

  
  # Calculate counts by groups
  all_counts = data.frame('group'='All', 'n'=as.character(nrow(DE_genes)))
  Others_counts = data.frame('group'='Others', n=as.character(sum(DE_genes$Group == 'Others')))
  Neuronal_counts = data.frame('group'='Neuronal', n=as.character(sum(DE_genes$Group == 'Neuronal')))
  lfc_counts = DE_genes %>% filter(Group == 'SFARI') %>% tally %>%
               mutate('group'='SFARI', 'n'=as.character(n)) %>%
               bind_rows(Neuronal_counts, Others_counts, all_counts) %>%
               mutate('lfc'=lfc) %>% dplyr::select(lfc, group, n)
  
  
  # Update lfc_counts_all
  lfc_counts_all = lfc_counts_all %>% bind_rows(lfc_counts)
}

# Add missing entries with 0s
lfc_counts_all = expand.grid('group'=unique(lfc_counts_all$group), 'lfc'=unique(lfc_counts_all$lfc)) %>% 
  left_join(lfc_counts_all, by=c('group','lfc')) %>% replace(is.na(.), 0)

# Calculate percentage of each group remaining
tot_counts = genes_info %>% filter(Group == 'SFARI') %>% tally() %>%
             mutate('group'='SFARI', 'tot'=n) %>% dplyr::select(group, tot) %>%
             bind_rows(data.frame('group'='Neuronal', 'tot'=sum(genes_info$Group == 'Neuronal')),
                       data.frame('group' = 'Others', 'tot' = sum(genes_info$Group == 'Others')),
                       data.frame('group'='All', 'tot'=nrow(genes_info)))

lfc_counts_all = lfc_counts_all %>% filter(lfc!=-1) %>% #, group!='Others') %>% 
                 left_join(tot_counts, by='group') %>% mutate('perc'=round(100*as.numeric(n)/tot,2))


# Plot change of number of genes
ggplotly(lfc_counts_all %>% filter(group != 'All') %>% 
         mutate(group = factor(group, levels = c('SFARI', 'Neuronal', 'Others'))) %>%
         ggplot(aes(lfc, perc, color=group)) + geom_point(aes(id=n)) + geom_line() +
         scale_color_manual(values=c('#00A4F7', SFARI_colour_hue(r=c(8,7)))) + 
         ylab('Percentage of DE Genes') +  xlab('LFC threshold') + labs(color = 'Group') + 
         #ggtitle('Effect of filtering thresholds in SFARI Genes') + 
         theme_minimal() + theme(legend.position = 'bottom'))
rm(lfc_list, all_counts, Others_counts, Neuronal_counts, lfc_counts, lfc_counts_all, DE_genes, tot_counts)




2.4.2 Analysis of SFARI Genes by SFARI Score


Mean Expression

plot_data = data.frame('ID'=rownames(datExpr), 'MeanExpr'=rowMeans(datExpr)) %>% left_join(genes_info, by='ID')

wt = plot_data %>% wilcox_test(MeanExpr~gene.score, p.adjust.method='BH') %>% add_x_position(x = 'group')
increase = 1
base = 15.5
pos_y_comparisons = c(base + c(0,1,3,5)*increase, base+c(0,2,4)*increase, base+0:1*increase, base)
plot_data %>% ggplot(aes(gene.score, MeanExpr)) + 
              geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=gene.score)) +
              stat_pvalue_manual(wt, y.position = pos_y_comparisons, tip.length = .01) +
              scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) + 
              xlab('') + ylab('Mean Expression') + 
              scale_x_discrete(labels=c('1' = 'SFARI\nScore 1', '2' = 'SFARI\nScore 2', '3' = 'SFARI\nScore 3', 
                                        'Others' = 'Other\ngenes', 'Neuronal' = 'Neuronal\ngenes')) +
              theme_minimal() + theme(legend.position='none')

Log Fold Change

genes_info %>% mutate(direction = ifelse(log2FoldChange>0, 'overexpressed', 'underexpressed')) %>% 
               group_by(gene.score, direction) %>% tally(name = 'overexpressed') %>% 
               filter(direction == 'overexpressed') %>% ungroup %>% 
               left_join(genes_info %>% group_by(gene.score) %>% tally(name = 'Total'), by = 'gene.score') %>% ungroup %>%
               mutate('underexpressed' = Total - overexpressed , 
                      'ratio' = round(overexpressed/underexpressed,2)) %>% 
               dplyr::select(gene.score, overexpressed, underexpressed, ratio) %>% 
               kable %>% kable_styling(full_width = F)
gene.score overexpressed underexpressed ratio
1 66 78 0.85
2 102 103 0.99
3 211 229 0.92
Neuronal 673 671 1.00
Others 7194 6790 1.06


plot_data = data.frame('ID'=rownames(datExpr), 'MeanExpr'=rowMeans(datExpr)) %>% 
            left_join(genes_info, by='ID')

wt = plot_data %>% filter(log2FoldChange>0) %>% 
     wilcox_test(MeanExpr~gene.score, p.adjust.method='BH') %>% add_x_position(x = 'group')
increase = 1
base = 15.5
pos_y_comparisons = c(base + c(0,1,3,5)*increase, base+c(0,2,4)*increase, base+0:1*increase, base)
p1 = plot_data %>% filter(log2FoldChange>0) %>% ggplot(aes(gene.score, MeanExpr)) +
     geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=gene.score)) + 
     stat_pvalue_manual(wt, y.position = pos_y_comparisons, tip.length = .03) +
     scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) + 
     xlab('') + ylab('Mean Expression') + ggtitle('Overexpressed genes') +
     scale_x_discrete(labels=c('1' = 'SFARI\nScore 1', '2' = 'SFARI\nScore 2', '3' = 'SFARI\nScore 3', 
                                        'Others' = 'Other\ngenes', 'Neuronal' = 'Neuronal\ngenes')) +
     theme_minimal() + theme(legend.position='none')

wt = plot_data %>% filter(log2FoldChange<0) %>% 
     wilcox_test(MeanExpr~gene.score, p.adjust.method='BH') %>% add_x_position(x = 'group')
p2 = plot_data %>% filter(log2FoldChange<0) %>% ggplot(aes(gene.score, MeanExpr)) +
     geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=gene.score)) + 
     stat_pvalue_manual(wt, y.position = pos_y_comparisons, tip.length = .03) +
      scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) + 
     xlab('') + ylab('Mean Expression') + ggtitle('Underexpressed genes') +
     scale_x_discrete(labels=c('1' = 'SFARI\nScore 1', '2' = 'SFARI\nScore 2', '3' = 'SFARI\nScore 3', 
                                        'Others' = 'Other\ngenes', 'Neuronal' = 'Neuronal\ngenes')) +
     theme_minimal() + theme(legend.position='none')

grid.arrange(p1, p2, nrow=1)


Both with the original LFC values as well as the shrunken ones, the difference in LFC magnitude between SFARI genes and Neuronal genes is statisically significant with a p-value lower than \(10^{-4}\).

wt = genes_info %>% mutate(abs_lfc = abs(log2FoldChange), 
                           Group = factor(gene.score, levels = c('1','2','3','Others','Neuronal'))) %>% 
     wilcox_test(abs_lfc~Group, p.adjust.method='BH') %>% add_x_position(x = 'group')

increase = 0.04
base = 0.45
pos_y_comparisons = c(base + c(0,1,3,5)*increase, base+c(0,2,4)*increase, base+0:1*increase, base)
p1 = genes_info %>% mutate(gene.score = factor(gene.score, levels = c('1','2','3','Others','Neuronal'))) %>%
     ggplot(aes(gene.score, abs(log2FoldChange))) + 
     geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=gene.score)) + 
     stat_pvalue_manual(wt, y.position = pos_y_comparisons, tip.length = .005) + 
     scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,7,8))) + 
     coord_cartesian(ylim= c(0, max(pos_y_comparisons))) + 
     scale_x_discrete(labels=c('1' = 'SFARI\nScore 1', '2' = 'SFARI\nScore 2', '3' = 'SFARI\nScore 3', 
                               'Neuronal' = 'Neuronal\ngenes','Others' = 'Other\ngenes')) +
     xlab('') + ylab('Original LFC Magnitude') + theme_minimal() + theme(legend.position='none')


wt = genes_info %>% mutate(abs_lfc = abs(shrunken_log2FoldChange), 
                           Group = factor(gene.score, levels = c('1','2','3','Others', 'Neuronal'))) %>% 
     wilcox_test(abs_lfc~Group, p.adjust.method='BH') %>% add_x_position(x = 'group')

increase = 0.03
base = 0.35
pos_y_comparisons = c(base + c(0,1,3,5)*increase, base+c(0,2,4)*increase, base+0:1*increase, base)
p2 = genes_info %>% mutate(gene.score = factor(gene.score, levels = c('1','2','3','Others','Neuronal'))) %>%
     ggplot(aes(gene.score, abs(shrunken_log2FoldChange))) + 
     geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=gene.score)) + 
     stat_pvalue_manual(wt, y.position = pos_y_comparisons, tip.length = .01) + 
     scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,7,8))) + 
     coord_cartesian(ylim= c(0, max(pos_y_comparisons))) + 
     scale_x_discrete(labels=c('1' = 'SFARI\nScore 1', '2' = 'SFARI\nScore 2', '3' = 'SFARI\nScore 3', 
                               'Neuronal' = 'Neuronal\ngenes', 'Others' = 'Other\ngenes')) +
     xlab('') + ylab('Shrunken LFC Magnitude') + theme_minimal() + theme(legend.position='none')

grid.arrange(p1, p2, nrow = 1)

rm(increase, base, pos_y_comparisons, wt)


Full plots, without cropping out outliers

p1 = genes_info %>% mutate(gene.score = factor(gene.score, levels = c('1','2','3','Others','Neuronal'))) %>%
     ggplot(aes(gene.score, abs(log2FoldChange))) + 
     geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=gene.score)) + 
     scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,7,8))) + 
     scale_x_discrete(labels=c('1' = 'SFARI\nScore 1', '2' = 'SFARI\nScore 2', '3' = 'SFARI\nScore 3', 
                               'Neuronal' = 'Neuronal\ngenes','Others' = 'Other\ngenes')) +
     xlab('') + ylab('Original LFC Magnitude') + theme_minimal() + theme(legend.position='none')

p2 = genes_info %>% mutate(gene.score = factor(gene.score, levels = c('1','2','3','Others','Neuronal'))) %>%
     ggplot(aes(gene.score, abs(shrunken_log2FoldChange))) + 
     geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=gene.score)) + 
     scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,7,8))) + 
     scale_x_discrete(labels=c('1' = 'SFARI\nScore 1', '2' = 'SFARI\nScore 2', '3' = 'SFARI\nScore 3', 
                               'Neuronal' = 'Neuronal\ngenes', 'Others' = 'Other\ngenes')) +
     xlab('') + ylab('Shrunken LFC Magnitude') + theme_minimal() + theme(legend.position='none')

subplot(ggplotly(p1), ggplotly(p2), nrows=1)
rm(increase, base, pos_y_comparisons, wt)
## Warning in rm(increase, base, pos_y_comparisons, wt): object 'increase' not
## found
## Warning in rm(increase, base, pos_y_comparisons, wt): object 'base' not found
## Warning in rm(increase, base, pos_y_comparisons, wt): object 'pos_y_comparisons'
## not found
## Warning in rm(increase, base, pos_y_comparisons, wt): object 'wt' not found


Differential Expression


lfc_list = seq(0, 0.3, 0.01)

all_counts = data.frame('group'='All', 'n'=as.character(nrow(genes_info)))
Others_counts = data.frame('group'='Others', n=as.character(sum(genes_info$gene.score == 'Others')))
Neuronal_counts = data.frame('group'='Neuronal', n=as.character(sum(genes_info$gene.score == 'Neuronal')))
lfc_counts_all = genes_info %>% group_by(`gene-score`) %>% filter(`gene-score` != 'Others') %>% tally %>%
                 mutate('group'=as.factor(`gene-score`), 'n'=as.character(n)) %>%
                 dplyr::select(group, n) %>%
                 bind_rows(Neuronal_counts, Others_counts, all_counts) %>%
                 mutate('lfc'=-1) %>%  dplyr::select(lfc, group, n)

for(lfc in lfc_list){
  
  # Recalculate genes_info with the new threshold (p-values change)
  DE_genes = results(dds, lfcThreshold=lfc, altHypothesis='greaterAbs') %>% data.frame %>%
             mutate('ID'=rownames(.)) %>% 
             left_join(genes_info %>% dplyr::select(ID, Neuronal, gene.score), by='ID') %>% 
             filter(padj<0.05 & abs(log2FoldChange)>lfc)

  
  # Calculate counts by groups
  all_counts = data.frame('group'='All', 'n'=as.character(nrow(DE_genes)))
  Others_counts = data.frame('group'='Others', n=as.character(sum(DE_genes$gene.score == 'Others')))
  Neuronal_counts = data.frame('group'='Neuronal', n=as.character(sum(DE_genes$gene.score == 'Neuronal')))
  lfc_counts = DE_genes %>% group_by(gene.score) %>% tally %>%
               mutate('group'=gene.score, 'n'=as.character(n)) %>%
               bind_rows(Neuronal_counts, all_counts) %>%
               mutate('lfc'=lfc) %>% dplyr::select(lfc, group, n)
  
  
  # Update lfc_counts_all
  lfc_counts_all = lfc_counts_all %>% bind_rows(lfc_counts)
}

# Add missing entries with 0s
lfc_counts_all = expand.grid('group'=unique(lfc_counts_all$group), 'lfc'=unique(lfc_counts_all$lfc)) %>% 
  left_join(lfc_counts_all, by=c('group','lfc')) %>% replace(is.na(.), 0)

# Calculate percentage of each group remaining
tot_counts = genes_info %>% group_by(`gene-score`) %>% tally() %>% filter(`gene-score`!='Others') %>%
             mutate('group'=factor(`gene-score`), 'tot'=n) %>% dplyr::select(group, tot) %>%
             bind_rows(data.frame('group'='Neuronal', 'tot'=sum(genes_info$gene.score == 'Neuronal')),
                       data.frame('group'='Others', 'tot'=sum(genes_info$gene.score == 'Others')),
                       data.frame('group'='All', 'tot'=nrow(genes_info)))

lfc_counts_all = lfc_counts_all %>% filter(lfc!=-1) %>% #, group!='Others') %>% 
                 left_join(tot_counts, by='group') %>% mutate('perc'=round(100*as.numeric(n)/tot,2))


# Plot change of number of genes
ggplotly(lfc_counts_all %>% filter(group != 'All') %>% 
         mutate(group = ifelse(group %in% c('1','2','3'), paste0('Score ',group), group)) %>% 
         mutate(group = factor(group, levels = c('Score 1', 'Score 2', 'Score 3', 'Neuronal', 'Others'))) %>%
         ggplot(aes(lfc, perc, color=group)) + 
         geom_point(aes(id=n)) + geom_line() + scale_color_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) + 
         ylab('Percentage of DE Genes') +  xlab('LFC threshold') +  labs(color = 'Group') +
         #ggtitle('Effect of filtering thresholds by SFARI score') + 
         #guides(color=guide_legend(nrow=2,byrow=TRUE) + # for breaking the legend into two rows
         theme_minimal() + theme(legend.position = 'bottom'))
rm(lfc_list, all_counts, Neuronal_counts, lfc_counts_all, lfc, lfc_counts, lfc_counts_all, tot_counts,
   lfc_counts_all, Others_counts)

Session info

sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] kableExtra_1.1.0            knitr_1.32                 
##  [3] DESeq2_1.24.0               SummarizedExperiment_1.14.1
##  [5] DelayedArray_0.10.0         BiocParallel_1.18.1        
##  [7] matrixStats_0.56.0          Biobase_2.44.0             
##  [9] GenomicRanges_1.36.1        GenomeInfoDb_1.20.0        
## [11] IRanges_2.18.3              S4Vectors_0.22.1           
## [13] BiocGenerics_0.30.0         expss_0.10.2               
## [15] rstatix_0.6.0               ggpubr_0.2.5               
## [17] magrittr_2.0.1              GGally_1.5.0               
## [19] gridExtra_2.3               viridis_0.5.1              
## [21] viridisLite_0.4.0           RColorBrewer_1.1-2         
## [23] plotly_4.9.2                glue_1.4.2                 
## [25] reshape2_1.4.4              forcats_0.5.0              
## [27] stringr_1.4.0               dplyr_1.0.1                
## [29] purrr_0.3.4                 readr_1.3.1                
## [31] tidyr_1.1.0                 tibble_3.1.2               
## [33] ggplot2_3.3.5               tidyverse_1.3.0            
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1           backports_1.1.8        Hmisc_4.4-0           
##   [4] plyr_1.8.6             lazyeval_0.2.2         splines_3.6.3         
##   [7] crosstalk_1.1.0.1      digest_0.6.27          htmltools_0.5.1.1     
##  [10] fansi_0.5.0            checkmate_2.0.0        memoise_1.1.0         
##  [13] cluster_2.1.0          openxlsx_4.1.4         annotate_1.62.0       
##  [16] modelr_0.1.6           jpeg_0.1-8.1           colorspace_2.0-2      
##  [19] blob_1.2.1             rvest_0.3.5            haven_2.2.0           
##  [22] xfun_0.22              crayon_1.4.1           RCurl_1.98-1.2        
##  [25] jsonlite_1.7.2         genefilter_1.66.0      survival_3.2-7        
##  [28] gtable_0.3.0           zlibbioc_1.30.0        XVector_0.24.0        
##  [31] webshot_0.5.2          car_3.0-7              abind_1.4-5           
##  [34] scales_1.1.1           DBI_1.1.0              Rcpp_1.0.6            
##  [37] xtable_1.8-4           htmlTable_1.13.3       foreign_0.8-76        
##  [40] bit_4.0.4              Formula_1.2-3          htmlwidgets_1.5.1     
##  [43] httr_1.4.2             acepack_1.4.1          ellipsis_0.3.2        
##  [46] pkgconfig_2.0.3        reshape_0.8.8          XML_3.99-0.3          
##  [49] farver_2.1.0           nnet_7.3-14            sass_0.4.0            
##  [52] dbplyr_1.4.2           locfit_1.5-9.4         utf8_1.2.1            
##  [55] tidyselect_1.1.0       labeling_0.4.2         rlang_0.4.11          
##  [58] AnnotationDbi_1.46.1   munsell_0.5.0          cellranger_1.1.0      
##  [61] tools_3.6.3            cli_2.5.0              generics_0.0.2        
##  [64] RSQLite_2.2.0          broom_0.7.0            evaluate_0.14         
##  [67] yaml_2.2.1             bit64_4.0.5            fs_1.5.0              
##  [70] zip_2.0.4              xml2_1.2.5             compiler_3.6.3        
##  [73] rstudioapi_0.11        curl_4.3               png_0.1-7             
##  [76] ggsignif_0.6.0         reprex_0.3.0           geneplotter_1.62.0    
##  [79] bslib_0.2.5.1          stringi_1.5.3          highr_0.8             
##  [82] lattice_0.20-41        Matrix_1.2-18          vctrs_0.3.8           
##  [85] pillar_1.6.1           lifecycle_1.0.0        jquerylib_0.1.4       
##  [88] data.table_1.12.8      bitops_1.0-6           R6_2.5.0              
##  [91] latticeExtra_0.6-29    rio_0.5.16             assertthat_0.2.1      
##  [94] withr_2.4.2            GenomeInfoDbData_1.2.1 hms_0.5.3             
##  [97] grid_3.6.3             rpart_4.1-15           rmarkdown_2.7         
## [100] carData_3.0-3          lubridate_1.7.4        base64enc_0.1-3